In [1]:
import os
import json
import pandas as pd
from sdv.utils import display_tables
from sdv.evaluation import evaluate
from sdmetrics.reports.multi_table import QualityReport
from sdmetrics.reports import utils
In [2]:
def read_tables(path):
    tables = {}
    for file in os.listdir(path):
        if file.endswith(".csv"):
            table_name = file[:-4].split("-")[-1]
            table = pd.read_csv(path + file)
            tables[table_name] = table
    return tables
In [3]:
real_data = read_tables("../data/zurich_source/")
synthetic_data = read_tables("../data/zurich_synthetic/")
In [4]:
# read metadata from ../models/zurich-sdv-metadata.json
metadata_name = '../models/zurich-sdv-1.0-metadata.json'
with open(metadata_name) as metadata_file:
    metadata = json.load(metadata_file)

The complete list of possible metrics is:

  • cstest: This metric compares the distributions of all the categorical columns of the table by using a Chi-squared test and returns the average of the p-values obtained across all the columns. If the tables that you are evaluating do not contain any categorical columns the result will be nan.
  • kscomplement: This metric compares the distributions of all the numerical columns of the table with a two-sample Kolmogorov-Smirnov test using the empirical CDF and returns the average of the KS statistic values obtained across all the columns. If the tables that you are evaluating do not contain any numerical columns the result will be nan.
  • logistic_detection: This metric tries to use a Logistic Regression classifier to detect whether each row is real or synthetic and then evaluates its performance using an Area under the ROC curve metric. The returned score is 1 minus the ROC AUC score obtained by the classifier.
  • svc_detection: This metric tries to use an Support Vector Classifier to detect whether each row is real or synthetic and then evaluates its performance using an Area under the ROC curve metric. The returned score is 1 minus the ROC AUC score obtained by the classifier.
In [5]:
evaluate(
    synthetic_data,
    real_data,
    aggregate=False,
    metadata=metadata)
Out[5]:
metric name raw_score normalized_score min_value max_value goal error
0 KSComplement Inverted Kolmogorov-Smirnov D statistic 0.547242 0.547242 0.0 1.0 MAXIMIZE None
1 CSTest Chi-Squared 0.975976 0.975976 0.0 1.0 MAXIMIZE None

SDMetrics library¶

In [6]:
report = QualityReport()
report.generate(real_data, synthetic_data, metadata)
Creating report:   0%|                                                                      | 0/5 [00:00<?, ?it/s]/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite.
  warnings.warn(ConstantInputWarning(msg))
Creating report: 100%|██████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  9.51it/s]
Overall Quality Score: 64.32%

Properties:
Column Shapes: 61.69%
Column Pair Trends: 67.55%
Parent Child Relationships: 63.71%
In [7]:
# save report to file
# report.save(filepath='../reports/quality_report-zurich-sdmetrics-1.0.pkl')
In [8]:
report.get_details(
    property_name='Column Shapes',
    table_name='customers')
Out[8]:
Table Column Metric Quality Score
0 customers date_of_birth KSComplement 0.798197
1 customers household_id KSComplement 0.500711
2 customers customer_type TVComplement 0.984283
3 customers gender TVComplement 0.876720
4 customers country_part TVComplement 0.909256
5 customers household_role TVComplement 0.447368
In [9]:
fig = report.get_visualization(
    property_name='Column Shapes',
    table_name='customers')

fig.show()
In [10]:
fig = report.get_visualization(
    property_name='Column Pair Trends',
    table_name='customers')

fig.show()
In [11]:
fig = report.get_visualization(
    property_name='Column Shapes',
    table_name='policy')

fig.show()
In [12]:
fig = report.get_visualization(
    property_name='Column Pair Trends',
    table_name='policy')

fig.show()
In [13]:
fig = report.get_visualization(
    property_name='Column Shapes',
    table_name='claim')

fig.show()
In [14]:
fig = report.get_visualization(
    property_name='Column Pair Trends',
    table_name='claim')

fig.show()
In [15]:
fig = report.get_visualization(
    property_name='Parent Child Relationships')

fig.show()
In [16]:
# plot for all columns in data
def plot_all_columns(real_data, synthetic_data, metadata):
    for table_name, table in real_data.items():
        for column_name in table.columns:
            try:
                fig = utils.get_column_plot(
                    real_data=real_data[table_name],
                    synthetic_data=synthetic_data[table_name],
                    column_name=column_name,
                    metadata=metadata['tables'][table_name]
                )
                fig.show()
            except Exception as e:
                print(f"Could not plot {table_name}.{column_name}")
                print(e)
In [17]:
plot_all_columns(real_data, synthetic_data, metadata)
Could not plot customers.customer_id
sdtype of type 'id' not recognized.
Could not plot claim.claim_recovered
The data appears to lie in a lower-dimensional subspace of the space in which it is expressed. This has resulted in a singular data covariance matrix, which cannot be treated using the algorithms implemented in `gaussian_kde`. Consider performing principle component analysis / dimensionality reduction and using `gaussian_kde` with the transformed data.
Could not plot claim.policy_id
sdtype of type 'id' not recognized.
Could not plot claim.claim_id
sdtype of type 'id' not recognized.
Could not plot policy.policy_id
sdtype of type 'id' not recognized.
Could not plot policy.customer_id
sdtype of type 'id' not recognized.
In [18]:
real_data['customers'].head()
Out[18]:
customer_id customer_type gender country_part date_of_birth household_id household_role
0 453 business NaN south NaN NaN NaN
1 608 business NaN north_east NaN NaN NaN
2 980 business NaN north_east NaN NaN NaN
3 997 business NaN north_east NaN NaN NaN
4 1298 business NaN centre NaN NaN NaN